﻿using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using HtmlAgilityPack;

namespace Chapleau.MicrodataParser
{
    /// <summary>
    /// Extracts HTML microdata items (elements carrying an <c>itemscope</c> attribute)
    /// and their <c>itemprop</c> properties from an HTML string.
    /// </summary>
    public class Parser
    {
        // Elements whose property value is read from their "src" attribute.
        private static readonly HashSet<string> SrcElements = new HashSet<string>(StringComparer.Ordinal)
        {
            "audio", "embed", "iframe", "img", "source", "track", "video"
        };

        // Elements whose property value is read from their "href" attribute.
        private static readonly HashSet<string> HrefElements = new HashSet<string>(StringComparer.Ordinal)
        {
            "a", "area", "link"
        };

        // ASCII whitespace used to separate tokens in itemtype / itemref attribute values.
        private static readonly char[] TokenSeparators = { ' ', '\t', '\r', '\n', '\f' };

        /// <summary>
        /// Parses <paramref name="html"/> and returns every top-level microdata item, i.e. every
        /// element that has an <c>itemscope</c> attribute and no <c>itemprop</c> attribute.
        /// </summary>
        /// <param name="html">The HTML markup to parse.</param>
        /// <param name="removeScriptandStyles">
        /// When <c>true</c>, all <c>script</c> and <c>style</c> elements are removed from the
        /// parsed document before items are collected.
        /// </param>
        /// <returns>The top-level items, in the order the document is traversed.</returns>
        public static List<MicrodataItem> Parse(string html, bool removeScriptandStyles)
        {
            HtmlDocument doc = new HtmlDocument();
            doc.LoadHtml(html);

            if (removeScriptandStyles)
            {
                // Snapshot the matches first so nodes are not removed from the tree
                // while Descendants() is still lazily walking it.
                List<HtmlNode> unwanted = doc.DocumentNode.Descendants()
                    .Where(n => n.Name == "script" || n.Name == "style")
                    .ToList();

                foreach (HtmlNode node in unwanted)
                {
                    node.Remove();
                }
            }

            List<MicrodataItem> items = new List<MicrodataItem>();

            foreach (HtmlNode node in doc.DocumentNode.Descendants())
            {
                // An itemscope that is also an itemprop is a nested item; it is reached
                // through its owning item rather than reported at the top level.
                if (node.Attributes.Contains("itemscope") && !node.Attributes.Contains("itemprop"))
                {
                    items.Add(GetMicroDataItem(doc, node));
                }
            }

            return items;
        }

        /// <summary>
        /// Builds a <see cref="MicrodataItem"/> from an element: its <c>itemtype</c> tokens,
        /// its <c>itemid</c> (when present) and the values of its properties.
        /// </summary>
        /// <param name="doc">The document that owns <paramref name="htmlItem"/>; used to resolve <c>itemref</c>.</param>
        /// <param name="htmlItem">The element that represents the item.</param>
        private static MicrodataItem GetMicroDataItem(HtmlDocument doc, HtmlNode htmlItem)
        {
            MicrodataItem item = new MicrodataItem();

            if (htmlItem.Attributes.Contains("itemtype"))
            {
                foreach (string type in SplitTokens(htmlItem.Attributes["itemtype"].Value))
                {
                    item.ItemType.Add(type);
                }
            }

            if (htmlItem.Attributes.Contains("itemid"))
            {
                item.ItemId = htmlItem.Attributes["itemid"].Value;
            }

            item.Properties = GetPropertiesValues(doc, GetProperties(doc, htmlItem));

            return item;
        }

        /// <summary>
        /// Converts property elements into values: a nested <see cref="MicrodataItem"/> for
        /// elements that carry <c>itemscope</c>, otherwise a <see cref="MicrodataKeyValuePair"/>
        /// whose value is read from the attribute or text appropriate for the element.
        /// Properties whose value is empty or whitespace are omitted.
        /// </summary>
        /// <param name="doc">The owning document; passed through when nested items are built.</param>
        /// <param name="properties">Elements that each carry an <c>itemprop</c> attribute.</param>
        private static List<MicrodataBase> GetPropertiesValues(HtmlDocument doc, List<HtmlNode> properties)
        {
            List<MicrodataBase> propertiesValues = new List<MicrodataBase>();

            foreach (HtmlNode property in properties)
            {
                // Invariant casing keeps tag comparison independent of the current culture.
                string tagName = property.Name.ToLowerInvariant();
                string name = property.Attributes["itemprop"].Value;
                MicrodataItem item = null;
                string value = string.Empty;

                // GetAttributeValue with an empty default is used below so that an element
                // missing its value attribute yields no property instead of a null dereference.
                if (property.Attributes.Contains("itemscope"))
                {
                    item = GetMicroDataItem(doc, property);
                }
                else if (tagName == "meta")
                {
                    value = property.GetAttributeValue("content", string.Empty);
                }
                else if (SrcElements.Contains(tagName))
                {
                    value = property.GetAttributeValue("src", string.Empty);
                }
                else if (HrefElements.Contains(tagName))
                {
                    value = property.GetAttributeValue("href", string.Empty);
                }
                else if (tagName == "object")
                {
                    value = property.GetAttributeValue("data", string.Empty);
                }
                else if (tagName == "data")
                {
                    value = property.GetAttributeValue("value", string.Empty);
                }
                else if (tagName == "time")
                {
                    // The datetime attribute takes precedence; the text content is the fallback.
                    value = property.Attributes.Contains("datetime")
                        ? property.Attributes["datetime"].Value
                        : property.InnerText;
                }
                // This is not from the Specifications, but some sites are using it.
                else if (property.Attributes.Contains("content"))
                {
                    value = HtmlEntity.DeEntitize(property.Attributes["content"].Value).Trim();
                }
                else
                {
                    value = HtmlEntity.DeEntitize(property.InnerText).Trim();
                }

                if (item != null)
                {
                    propertiesValues.Add(item);
                }

                if (!string.IsNullOrWhiteSpace(value))
                {
                    propertiesValues.Add(new MicrodataKeyValuePair { Name = name, Value = value });
                }
            }

            return propertiesValues;
        }

        /// <summary>
        /// Collects the property elements of an item: a breadth-first walk over the item's
        /// children and over any elements named by its <c>itemref</c> attribute. The walk does
        /// not descend into elements that carry <c>itemscope</c>.
        /// </summary>
        /// <param name="doc">The owning document, used to look up <c>itemref</c> targets by id.</param>
        /// <param name="htmlItem">The element that represents the item.</param>
        /// <returns>Every visited element that carries an <c>itemprop</c> attribute.</returns>
        private static List<HtmlNode> GetProperties(HtmlDocument doc, HtmlNode htmlItem)
        {
            // Set of nodes already handled; prevents duplicates and itemref cycles.
            HashSet<HtmlNode> visited = new HashSet<HtmlNode>();
            List<HtmlNode> results = new List<HtmlNode>();
            Queue<HtmlNode> pending = new Queue<HtmlNode>();

            visited.Add(htmlItem);
            foreach (HtmlNode child in htmlItem.ChildNodes)
            {
                pending.Enqueue(child);
            }

            if (htmlItem.Attributes.Contains("itemref"))
            {
                foreach (string id in SplitTokens(htmlItem.Attributes["itemref"].Value))
                {
                    // itemref lists element IDs; references that do not resolve are ignored.
                    HtmlNode referenced = doc.GetElementbyId(id);
                    if (referenced != null)
                    {
                        pending.Enqueue(referenced);
                    }
                }
            }

            while (pending.Count > 0)
            {
                HtmlNode current = pending.Dequeue();

                // Add returns false when the node was already visited.
                if (!visited.Add(current))
                {
                    continue;
                }

                // A nested itemscope owns its own properties, so do not descend into it.
                if (!current.Attributes.Contains("itemscope"))
                {
                    foreach (HtmlNode child in current.ChildNodes)
                    {
                        pending.Enqueue(child);
                    }
                }

                if (current.Attributes.Contains("itemprop"))
                {
                    results.Add(current);
                }
            }

            return results;
        }

        /// <summary>
        /// Splits a whitespace-separated attribute value into its non-empty tokens.
        /// </summary>
        private static string[] SplitTokens(string value)
        {
            return value.Split(TokenSeparators, StringSplitOptions.RemoveEmptyEntries);
        }
    }
}
